Warning in checkDepPackageVersion(dep_pkg = "TMB"): Package version inconsistency detected.
glmmTMB was built with TMB version 1.9.6
Current TMB version is 1.9.10
Please re-install glmmTMB from source or restore original 'TMB' package (see '?reinstalling' for more information)
Preprocessing & Clustering
Loading the data
## if you haven't downloaded it yet:download.file("http://imlspenticton.uzh.ch/teaching/STA426/week13.SCE.rds", dest="week13.SCE.rds")# note that by default downloads in R timeout after 1min, so# when downloading large objects you might need to increase this limit:# options(timeout=1200) # timeout after 20min, instead of 1min...
sce <-readRDS("week13.SCE.rds")# Have a look at the objectsce
# get QC metrics:sce <-addPerCellQC(sce, subsets=list(Mt=mito), percent.top=c(5,10))sce <-addPerFeatureQC(sce)# we plot some of the metricsqc <-as.data.frame(colData(sce))ggplot(qc, aes(subsets_Mt_percent)) +geom_histogram() +facet_wrap(~sample_id)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# we set thresholds on the library sizes and detection rate:sce$qc.out <-isOutlier(log(sce$sum),nmads=3,batch=sce$sample_id) |isOutlier(log(sce$detected),nmads=3,batch=sce$sample_id)# a fancier job would be accomplished by https://github.com/wmacnair/SampleQCtable(sce$qc.out)
FALSE TRUE
12115 32
# get rid of seldom detected genessce <- sce[rowData(sce)$detected>=4,]# we flag doublets: doublets are droplets that capture more than one cell and this might be a problem when it captures more than one cell-typesce <-scDblFinder(sce, samples="sample_id", BPPARAM=MulticoreParam(6))table(sce$scDblFinder.class)
singlet doublet
11484 663
Normalization & reduction
# (fast) standard log-normalizationsce <-logNormCounts(sce)# get high-variable genes (normally we'd take 3000, here we'll go skim for speed)hvg <-getTopHVGs(sce, n=1000)# alternative: variance-stabilizing transformation# vst <- suppressWarnings(sctransform::vst(counts(sce)))# logcounts(sce) <- vst$y# # get highly-variable genes# hvg <- row.names(sce)[order(vst$gene_attr$residual_variance,# decreasing=TRUE)[1:1000]]# run PCAsce <-runPCA(sce, ncomponents=50, subset_row=hvg)#How many componentes do you take? before the used the elbo, but it is not a good way to proceed. # check the variance explained by the PCs:pc.var <-attr(reducedDim(sce),"percentVar")plot(pc.var, xlab="PCs", ylab="% variance explained")
# restrict to the first 20 components:reducedDim(sce) <-reducedDim(sce)[,1:20]# run and plot 2d projections based on the PCA# to improve performance, use Annoy kNN approximation:sce <-runTSNE(sce, dimred="PCA", BNPARAM=AnnoyParam())sce <-runUMAP(sce, dimred="PCA", n_neighbors=30, BNPARAM=AnnoyParam())# you can compare the 2d embeddings:# sleepwalk::sleepwalk( as.list(reducedDims(sce)[c("TSNE","UMAP")]),# featureMatrices=reducedDims(sce)[["PCA"]] )#it is an impossible task to do not introduce distorsions in these methods of dimensionality reduction. Because to perserve the neighbours of one cell, you need to disturb the others. # plot by doublet scoreplotUMAP(sce, colour_by="scDblFinder.score") +plotTSNE(sce, colour_by="scDblFinder.score") +plot_layout(guides ="collect")
# filter out bad cellssce <- sce[,sce$scDblFinder.class!="doublet"&!sce$qc.out]
Both representations are capturing the same time of relationships (you can’t not really observe it here but yes in the plot he presented with more colours)
Batch correction
First we want to check whether there’s a need for integration/batch correction. This can be assessed by the extent to which the samples are well-mixed.
## a much better readout (but long to compute) for cell mixing would be given ## by the CellMixS package:# sce <- cms(sce, k=50, group = "sample_id", BPPARAM=MulticoreParam(6))# plotTSNE(sce, colour_by="cms_smooth")
# batch corrected using the mutual nearest neighborssce2 <-fastMNN(sce, batch=sce$sample_id, BNPARAM=AnnoyParam())# (see Harmony or Seurat integration for "stronger" integration methods)# we take the corrected PCAreducedDim(sce, type="PCA") <-reducedDim(sce2)[,1:20]# and recompute the 2d projectionssce <-runTSNE(sce, dimred="PCA", BNPARAM=AnnoyParam())sce <-runUMAP(sce, dimred="PCA", n_neighbors=25, BNPARAM=AnnoyParam())plotUMAP(sce, colour_by="group_id") +plotUMAP(sce, colour_by="sample_id")
Integration from the SEURAT packaged or harmony are useful to stronger batch-effect corrections.
# we could play with the `resolution_parameter` of `cluster_leiden` # to decide on the granularityplotTSNE(sce, colour_by="cluster", text_by="cluster") +plotUMAP(sce, colour_by="cluster", text_by="cluster")
On the slide 10/11, we can see that as increasing resolution, some clusters divide into others. That indicates that clustering might not have the best quaility.
Cluster abundances across samples
ca <-table(cluster=sce$cluster, sample=sce$sample_id)ggplot(as.data.frame(ca), aes(reorder(sample), cluster, fill=Freq)) +geom_tile() +geom_text(aes(label=Freq)) +scale_fill_viridis_c()
In the case of our data, we don’t really expect a lot of differencies in abundance.
Cluster annotation
De-novo marker identification
# we identify genes that differ between clusters:mm <- scran::findMarkers(sce, groups=sce$cluster, test.type="binom",BPPARAM=MulticoreParam(6))# we select the top 5 markers by cluster:markers <-unique(unlist(lapply(mm, FUN=function(x){head(row.names(x[x$FDR<0.01,]),5)})))
Known markers
This comes from literature information. What is a good marker as protein and stains might not be a good marker at RNA level.
genes <-list(astrocytes =c("Aqp4", "Gfap", "Fgfr3","Dio2"),endothelial =c("Cldn5","Nostrin","Flt1"),microglia =c("C1qb","Tyrobp","P2ry12", "Csf1r"),neuronal =c("Snap25", "Stmn2", "Syn1", "Rbfox3"),excNeuron =c("Slc17a7","Camk2a","Grin2b","Fezf2"),inhNeuron =c("Gad1","Lhx6","Adarb2"),oligodendro =c("Opalin","Plp1","Mag","Mog"),OPC =c("Pdgfra","Sox6","Bcan"))# since the row.names of the object have also the ensembl id, we find the matching row names for each gene:km <-lapply(genes, FUN=function(g) grep(paste0(g, "$", collapse="|"), rownames(sce), value=TRUE))
# heatmap for the de-novo markers:sechm(pb, markers, assayName ="propOfMax", show_colnames=TRUE,do.scale=FALSE, hmcols=viridis::viridis(100), row_names_gp=gpar(fontsize=9))
For simplicity, here rather than working on all clusters we’ll group them by broad cell class. As a very rough approximation to manual annotation, we will assign clusters to the cell type whose markers are the most expressed:
# we get rid of the unspecific neuronal markers:km2 <- km[names(km)!="neuronal"]# we extract the pseudo-bulk counts of the markers for each clustermat <-assay(pb)[unlist(km2),]# we aggregate across markers of the same typemat <-aggregate(t(scale(t(mat))),by=list(type=rep(names(km2), lengths(km2))),FUN=sum)# for each column (cluster), we select the row (cell type) which has the maximum aggregated valuecl2 <- mat[,1][apply(mat[,-1], 2, FUN=which.max)]# we convert the cells' cluster labels to cell type labels:sce$cluster2 <- cl2[sce$cluster]table(sce$cluster, sce$cluster2)
# we aggregate again to pseudo-bulk using the new clusterspb <-aggregateData(sce, "logcounts", by=c("cluster2"), fun="mean")assayNames(pb) <-"logcounts"# and we plot again the expression of the markers as a sanity checkrowData(pb)$marker4 <-NArowData(pb)[unlist(km),"marker4"] <-rep(names(km),lengths(km))sechm(pb, c(unlist(km)), assayName="logcounts", gaps_row="marker4",show_colnames=TRUE, do.scale=TRUE, breaks=1, row_title_rot=0)
# we aggregate by cluster x sample to perform pseudo-bulk differential state analysissce <- muscat::prepSCE(sce, kid="cluster2", sid ="sample_id",gid ="group_id")pb <-aggregateData(sce)pbMDS(pb)
# we run edgeR on each cluster and extract the resultsres <-pbDS(pb)
# top genes in a given cell typepbHeatmap(sce, res, k="astrocytes", top_n =5)
# we extract all differentially-expressed genes:degs <-unique(res2[res2$p_adj.loc<0.05,"gene"])# we flatten the pb object (putting all cell types in the same assay) and calculate foldchangespb2 <-pbFlatten(pb)# we add a logFC assay:pb2 <- sechm::log2FC(pb2, fromAssay="logcpm", controls=pb2$group_id=="WT", by=pb2$cluster_id)# we reorderpb2 <- pb2[,order(pb2$cluster_id, pb2$group_id)]# we plot a heatmap of the logFC of the top 200 genes across all cell typessechm(pb2, head(degs,200), assayName="log2FC", gaps_at="cluster_id",column_title_gp=gpar(fontsize=9))
The number of differentially expressed genes it is a bad signature for response to treatment when we have substantial differencies in abundances.